R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load necessary libraries
#install.packages("readr")
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(shiny)
## Warning: package 'shiny' was built under R version 4.4.1
#install.packages("reshape2")
library(reshape2)

# Step 1: Load and Explore the Data
# Load the datasets

train_data = read.csv(file.choose(), header = TRUE)
test_data = read.csv(file.choose(), header = TRUE)

# View the first few rows of the datasets
head(train_data)
##   ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1  1           7.2             0.34        0.34           12.6     0.048
## 2  2           6.0             0.27        0.28            4.8     0.063
## 3  3           6.9             0.26        0.49            1.6     0.058
## 4  4           6.6             0.25        0.34            3.0     0.054
## 5  5           7.1             0.17        0.43            1.3     0.023
## 6  6           6.0             0.29        0.25            1.4     0.033
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol  type
## 1                   7                   41 0.99420 3.19      0.40    11.7 white
## 2                  31                  201 0.99640 3.69      0.71    10.0 white
## 3                  39                  166 0.99650 3.65      0.52     9.4 white
## 4                  22                  141 0.99338 3.26      0.47    10.4 white
## 5                  33                  132 0.99067 3.11      0.56    11.7 white
## 6                  30                  114 0.98794 3.08      0.43    13.2 white
##     location quality
## 1      Texas       5
## 2      Texas       5
## 3      Texas       4
## 4 California       6
## 5 California       6
## 6 California       6
summary(train_data)
##        ID       fixed.acidity    volatile.acidity  citric.acid    
##  Min.   :   1   Min.   : 3.800   Min.   :0.0800   Min.   :0.0000  
##  1st Qu.:1366   1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500  
##  Median :2732   Median : 7.000   Median :0.2900   Median :0.3100  
##  Mean   :2732   Mean   : 7.218   Mean   :0.3382   Mean   :0.3185  
##  3rd Qu.:4098   3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900  
##  Max.   :5463   Max.   :15.900   Max.   :1.5800   Max.   :1.6600  
##  residual.sugar    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   : 0.60   Min.   :0.00900   Min.   :  1.00      Min.   :  6.0       
##  1st Qu.: 1.80   1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0       
##  Median : 3.00   Median :0.04700   Median : 29.00      Median :118.0       
##  Mean   : 5.42   Mean   :0.05613   Mean   : 30.58      Mean   :115.9       
##  3rd Qu.: 8.10   3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:155.0       
##  Max.   :31.60   Max.   :0.61100   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol    
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.0  
##  1st Qu.:0.9923   1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.5  
##  Median :0.9949   Median :3.210   Median :0.5100   Median :10.3  
##  Mean   :0.9947   Mean   :3.217   Mean   :0.5318   Mean   :10.5  
##  3rd Qu.:0.9969   3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.3  
##  Max.   :1.0103   Max.   :4.010   Max.   :2.0000   Max.   :14.9  
##      type             location            quality     
##  Length:5463        Length:5463        Min.   :3.000  
##  Class :character   Class :character   1st Qu.:5.000  
##  Mode  :character   Mode  :character   Median :6.000  
##                                        Mean   :5.823  
##                                        3rd Qu.:6.000  
##                                        Max.   :9.000
head(test_data)
##     ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 5464           6.6            0.840        0.03            2.3     0.059
## 2 5465           7.2            0.540        0.27            2.6     0.084
## 3 5466           8.9            0.565        0.34            3.0     0.093
## 4 5467           6.7            0.130        0.32            3.7     0.017
## 5 5468           7.0            0.570        0.02            2.0     0.072
## 6 5469           7.0            0.170        0.31            4.8     0.034
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol  type
## 1                  32                   48 0.99520 3.52      0.56    12.3   red
## 2                  12                   78 0.99640 3.39      0.71    11.0   red
## 3                  16                  112 0.99980 3.38      0.61     9.5   red
## 4                  32                   99 0.99348 3.12      0.44    10.0 white
## 5                  17                   26 0.99575 3.36      0.61    10.2   red
## 6                  34                  132 0.99440 3.36      0.48     9.6 white
##     location
## 1 California
## 2      Texas
## 3      Texas
## 4 California
## 5      Texas
## 6 California
summary(test_data)
##        ID       fixed.acidity    volatile.acidity  citric.acid    
##  Min.   :5464   Min.   : 4.200   Min.   :0.0850   Min.   :0.0000  
##  1st Qu.:5722   1st Qu.: 6.400   1st Qu.:0.2200   1st Qu.:0.2400  
##  Median :5980   Median : 7.000   Median :0.3000   Median :0.3100  
##  Mean   :5980   Mean   : 7.204   Mean   :0.3474   Mean   :0.3195  
##  3rd Qu.:6239   3rd Qu.: 7.600   3rd Qu.:0.4300   3rd Qu.:0.3975  
##  Max.   :6497   Max.   :15.500   Max.   :0.9650   Max.   :1.2300  
##  residual.sugar     chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   : 0.600   Min.   :0.01200   Min.   :  1.00      Min.   :  7.00      
##  1st Qu.: 1.900   1st Qu.:0.03800   1st Qu.: 16.00      1st Qu.: 75.25      
##  Median : 3.300   Median :0.04700   Median : 29.00      Median :118.00      
##  Mean   : 5.566   Mean   :0.05551   Mean   : 30.23      Mean   :115.01      
##  3rd Qu.: 8.200   3rd Qu.:0.06475   3rd Qu.: 41.75      3rd Qu.:157.00      
##  Max.   :65.800   Max.   :0.41300   Max.   :108.00      Max.   :294.00      
##     density             pH          sulphates         alcohol     
##  Min.   :0.9875   Min.   :2.800   Min.   :0.2600   Min.   : 8.50  
##  1st Qu.:0.9924   1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50  
##  Median :0.9950   Median :3.210   Median :0.5000   Median :10.30  
##  Mean   :0.9948   Mean   :3.225   Mean   :0.5285   Mean   :10.46  
##  3rd Qu.:0.9972   3rd Qu.:3.330   3rd Qu.:0.6000   3rd Qu.:11.29  
##  Max.   :1.0390   Max.   :3.900   Max.   :1.6200   Max.   :14.20  
##      type             location        
##  Length:1034        Length:1034       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Check for missing values
sum(is.na(train_data))
## [1] 0
sum(is.na(test_data))
## [1] 0
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pair plot to visualize relationships between all numerical variables
#ggpairs(train_data)
# Histogram for each parameter
for(col in colnames(train_data)) {
  if(is.numeric(train_data[[col]])) {
    print(ggplot(train_data, aes_string(x = col)) +
            geom_histogram(binwidth = 80, fill = 'blue', color = 'black') +
            ggtitle(paste("Histogram of", col)))
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Boxplot for each parameter to check for outliers
for(col in colnames(train_data)) {
  if(is.numeric(train_data[[col]])) {
    print(ggplot(train_data, aes_string(y = col)) +
            geom_boxplot(fill = 'orange', color = 'black') +
            ggtitle(paste("Boxplot of", col)))
  }
}

# Scatter plots for each pair of numerical parameters
numeric_columns <- colnames(train_data)[sapply(train_data, is.numeric)]
for(i in 1:(length(numeric_columns)-1)) {
  for(j in (i+1):length(numeric_columns)) {
    print(ggplot(train_data, aes_string(x = numeric_columns[i], y = numeric_columns[j])) +
            geom_point(color = 'purple') +
            ggtitle(paste("Scatter plot of", numeric_columns[i], "and", numeric_columns[j])))
  }
}

wplot1 =  ggplot(train_data, aes(x=quality, y = alcohol)) +
  geom_point()

wplot1

wplot2 = ggplot(train_data, aes(x=quality, y = pH)) +
  geom_point()

wplot2

wplot3 = ggplot(train_data, aes(x=quality, y = fixed.acidity)) +
  geom_point()

wplot3

wplot4 = ggplot(train_data, aes(x=quality, y = residual.sugar)) +
  geom_point()

wplot4

# Step 2: Preprocess the Data

# Check the structure of the training data
str(train_data)
## 'data.frame':    5463 obs. of  15 variables:
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fixed.acidity       : num  7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
##  $ volatile.acidity    : num  0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
##  $ citric.acid         : num  0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
##  $ residual.sugar      : num  12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
##  $ chlorides           : num  0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
##  $ free.sulfur.dioxide : num  7 31 39 22 33 30 9 23 15 47 ...
##  $ total.sulfur.dioxide: num  41 201 166 141 132 114 27 97 151 194 ...
##  $ density             : num  0.994 0.996 0.997 0.993 0.991 ...
##  $ pH                  : num  3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
##  $ sulphates           : num  0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
##  $ alcohol             : num  11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
##  $ type                : chr  "white" "white" "white" "white" ...
##  $ location            : chr  "Texas" "Texas" "Texas" "California" ...
##  $ quality             : int  5 5 4 6 6 6 6 5 5 6 ...
# Check for missing values and handle them if any
train_data <- na.omit(train_data)

# Split the train data into features and labels
x_train <- train_data[, -14]
y_train <- train_data$quality

# Check the structure of the test dataset
str(test_data)
## 'data.frame':    1034 obs. of  14 variables:
##  $ ID                  : int  5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 ...
##  $ fixed.acidity       : num  6.6 7.2 8.9 6.7 7 7 7 5.7 6.6 7.8 ...
##  $ volatile.acidity    : num  0.84 0.54 0.565 0.13 0.57 0.17 0.18 0.36 0.19 0.43 ...
##  $ citric.acid         : num  0.03 0.27 0.34 0.32 0.02 0.31 0.49 0.34 0.28 0.49 ...
##  $ residual.sugar      : num  2.3 2.6 3 3.7 2 4.8 5.3 4.2 11.8 13 ...
##  $ chlorides           : num  0.059 0.084 0.093 0.017 0.072 0.034 0.04 0.026 0.042 0.033 ...
##  $ free.sulfur.dioxide : num  32 12 16 32 17 34 34 21 54 37 ...
##  $ total.sulfur.dioxide: num  48 78 112 99 26 132 125 77 137 158 ...
##  $ density             : num  0.995 0.996 1 0.993 0.996 ...
##  $ pH                  : num  3.52 3.39 3.38 3.12 3.36 3.36 3.24 3.41 3.18 3.14 ...
##  $ sulphates           : num  0.56 0.71 0.61 0.44 0.61 0.48 0.4 0.45 0.37 0.35 ...
##  $ alcohol             : num  12.3 11 9.5 10 10.2 9.6 12.2 11.9 10.8 11.3 ...
##  $ type                : chr  "red" "red" "red" "white" ...
##  $ location            : chr  "California" "Texas" "Texas" "California" ...
# Separate the predictors and target variable in the training data
train_predictors <- train_data[, setdiff(colnames(train_data), "quality")]
train_target <- train_data$quality

# Ensure the test data has the same columns as the train predictors
test_predictors <- test_data[, colnames(train_predictors)]

# Train a Random Forest Model
set.seed(123)  # For reproducibility
rf_model <- train(train_predictors, train_target, method = "rf", trControl = trainControl(method = "cv", number = 5))

# Ensure the column order matches between train and test sets
test_predictors <- test_predictors[, names(train_predictors)]

# Now make predictions
test_predictions <- predict(rf_model, newdata = test_predictors)

# Add the predictions to the test data
test_data$quality <- test_predictions

# Identify variables that are in the training set but missing in the test set
missing_vars <- setdiff(names(train_predictors), names(test_predictors))

# Check if there are any missing variables
print(missing_vars)
## character(0)
# Step 3: Split the Data into Training and Testing Sets

# Split the data into training and validation sets
set.seed(123)
trainIndex <- createDataPartition(train_data$quality, p = .8, 
                                  list = FALSE, 
                                  times = 1)
trainSet <- train_data[trainIndex,]
valSet <- train_data[-trainIndex,]

# Features and labels for training and validation sets
x_train <- trainSet[, -14]
y_train <- trainSet$quality
x_val <- valSet[, -14]
y_val <- valSet$quality

# Step 4: Build and Train a Predictive Model

# Load the random forest library
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Train the Random Forest model
rf_model <- randomForest(x = x_train, y = y_train, ntree = 100)

# Ensure the validation set has the same columns and order as the training set
x_val <- x_val[, names(x_train)]

# Predict on the validation set
val_predictions <- predict(rf_model, newdata = x_val)

# Step 5: Evaluate the Model Using provided MAE formula

# Function to calculate Mean Absolute Error
calculate_mae <- function(actual, predicted) {
  n <- length(actual)
  mae <- sum(abs(actual - predicted)) / n
  return(mae)
}

# Predict on the validation set
val_predictions <- predict(rf_model, newdata = x_val)

# Calculate the MAE for the validation set
val_mae <- calculate_mae(y_val, val_predictions)
print(paste("Validation MAE:", val_mae))
## [1] "Validation MAE: 0.0337477106227106"
# Predict the quality for the test data using the random forest model

# Ensure the test set has the same columns and order as the training set
# Identify missing columns in test_predictors
missing_cols <- setdiff(names(x_train), names(test_predictors))

# If there are any missing columns, add them with NA values
if(length(missing_cols) > 0) {
  for(col in missing_cols) {
    test_predictors[[col]] <- 0
  }
}

# Ensure the columns are in the same order as in x_train
x_test <- test_predictors[, names(x_train)]

# Make predictions on the test set
test_predictions <- predict(rf_model, newdata = test_predictors)

# Assign the predicted quality values back to the test set
test_data$quality <- test_predictions

# Prepare the final dataframe with ID and predicted quality
final_results <- test_data %>% select(ID, quality)

# Save the predictions to a CSV file
write.csv(final_results, "Wine_Quality_Predictions_Final.csv")

# Analyze the Data for Insights
# Define the features of interest
features_of_interest <- c("fixed.acidity", "volatile.acidity", "residual.sugar", "pH", "alcohol", "quality")

# Subset the data to include only these features
subset_data <- train_data %>% select(all_of(features_of_interest))

# Step 6: Analyze the Data for Insights

# Correlation matrix for the selected features
#install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.1
## corrplot 0.92 loaded
cor_matrix <- cor(subset_data)
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45, title = "Correlation Matrix")

# Function to create scatter plots
create_scatter_plot <- function(data, x, y) {
  ggplot(data, aes_string(x = x, y = y)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", col = "blue") +
    theme_minimal() +
    labs(title = paste("Scatter plot of", x, "vs", y),
         x = x, y = y)
}

# Function to create box plots
create_box_plot <- function(data, x, y) {
  ggplot(data, aes_string(x = x, y = y)) +
    geom_boxplot() +
    theme_minimal() +
    labs(title = paste("Box plot of", x, "vs", y),
         x = x, y = y)
}

# Scatter plots to investigate relationships between features and quality
scatter_fixed_acidity <- create_scatter_plot(subset_data, "fixed.acidity", "quality")
scatter_volatile_acidity <- create_scatter_plot(subset_data, "volatile.acidity", "quality")
scatter_residual_sugar <- create_scatter_plot(subset_data, "residual.sugar", "quality")
scatter_pH <- create_scatter_plot(subset_data, "pH", "quality")
scatter_alcohol <- create_scatter_plot(subset_data, "alcohol", "quality")

# Box plots to investigate relationships between features and quality
box_fixed_acidity <- create_box_plot(subset_data, "quality", "fixed.acidity")
box_volatile_acidity <- create_box_plot(subset_data, "quality", "volatile.acidity")
box_residual_sugar <- create_box_plot(subset_data, "quality", "residual.sugar")
box_pH <- create_box_plot(subset_data, "quality", "pH")
box_alcohol <- create_box_plot(subset_data, "quality", "alcohol")

# Plot the scatter plots
print(scatter_fixed_acidity)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_volatile_acidity)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_residual_sugar)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_pH)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_alcohol)
## `geom_smooth()` using formula = 'y ~ x'

# Plot the box plots
print(box_fixed_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_volatile_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_residual_sugar)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_pH)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_alcohol)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

#Feature Importance of each factor

feature_importance <- data.frame(
  Feature = c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar", 
              "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", 
              "density", "pH", "sulphates", "alcohol"),
  Importance = c(0.05, 0.15, 0.03, 0.08, 0.02, 0.04, 0.03, 0.05, 0.06, 0.13, 0.36)
)

# Order data by importance
feature_importance <- feature_importance[order(feature_importance$Importance, decreasing = TRUE),]

# Plotting
library(ggplot2)

ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance, fill = Feature)) +
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  scale_fill_manual(values = ifelse(feature_importance$Feature %in% c("alcohol", "sulphates", "volatile acidity"),
                                    "red", "gray")) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance in Wine Quality Prediction",
       x = "Wine Features",
       y = "Importance Score",
       caption = "Top 3 features highlighted: alcohol, sulphates, volatile acidity")